This file processes the combined Summer 2022 data files into files that contain only valid data for analysis, excluding invalid sessions and participants.

Data is imported from 2 files, indicating two levels of analysis: participants and blocks (item-level).

Note: mouse-cursor data contained in final_mouse_blocks.json file is not handled here.

#IMPORT DATA
# Two flattened exports from the database: one row per participant, one row per item.
df_participants <- fromJSON("input/su22_sgc4c_final_participants.json")
df_items <- fromJSON("input/su22_sgc4c_final_items.json")

#add term indicator
# Tag both tables with the academic term so files can later be combined across terms.
df_participants$term <- "summer22"
df_items$term <- "summer22"

#DEFINE SGC_4C validity criteria
sessions <- c("suPROLIFIC") #SGC4C running on prolific
# All condition codes valid for this study (TRI/ORTH x rotate-45/rotate-90 variants)
conditions <- c("113112", "11311221", "11311331", "111112", "11111221", "11111331")
violation_threshold <- 5.5 #number of allowable browser violations (exclusion uses strictly greater-than)
# Self-rated effort responses that disqualify a participant.
# NOTE(review): these strings use an ASCII apostrophe; the survey option text may use a
# curly apostrophe (didn't vs didn’t) — confirm they match the stored responses exactly.
effort_exclusion <- c("I didn't try very hard, or rushed through the questions", "I started out trying hard, but gave up at some point")
n_items <- 15 #fifteen items is complete dataset per participant

#placeholder for excluding participants
# Accumulates one row per excluded participant; a `reason` column is added at each step.
ex_participants <- data.frame()

Note: we drop all scores calculated in the stimulus engine (except absolute score, which uses a simple count of strictly correct answers), as they are recalculated during analysis using a different MC scoring algorithm.

#create factors in PARTICIPANTS
# Coerce IDs to character, categorical variables to factors, derive a readable
# condition label, and convert total time from ms to minutes; then order columns.
df_participants <- df_participants %>%
  mutate( #create factors and remove extraneous ""
    subject = as.character(subject),
    condition = as.character(condition),
    # Two raw codes map onto each 45-degree label (engine variants of the same condition).
    pretty_condition = recode_factor(condition,
                                     "113112" = "TRI-rotate-45", "111112" = "ORTH-rotate-45",
                                     "11311221" = "TRI-rotate-45", "11111221" = "ORTH-rotate-45",
                                     "11311331" = "TRI-rotate-90", "11111331" = "ORTH-rotate-90"),
    study = factor(study),
    session = factor(session),
    exp_id = factor(exp_id),
    sona_id = as.character(sona_id),
    pool = factor(pool),
    mode = factor(mode),
    attn_check = factor(attn_check),
    status = factor(status),
    term = factor(term),
    gender = as.factor(gender),
    age = as.integer(age),
    country = gsub('"', "", country), #strip stray double-quote characters
    year = factor(schoolyear),
    major = factor(major),
    browser = factor(browser),
    os = factor(os),
    native_language = factor(language),
    totaltime_m = totaltime / 1000 / 60 #ms -> minutes
  ) %>% dplyr::select( #order cols
    # NOTE(review): derived columns `year` and `native_language` are created above but
    # not selected here, so they are dropped; the raw `schoolyear` and `language`
    # columns are kept instead — confirm intentional.
    subject,
    study,
    condition,
    pretty_condition,
    session,
    exp_id,
    sona_id,
    pool,
    mode,
    attn_check,
    # explanation,
    effort,
    difficulty,
    confidence,
    enjoyment,
    other,
    age,
    country,
    language,
    schoolyear,
    major,
    gender,
    disability,
    browser,
    width,
    height,
    os,
    starttime,
    status,
    term,
    violations,
    absolute_score,
    # discriminant_score,
    # tri_score,
    # orth_score,
    # other_score,
    # blank_score,
    totaltime_m
  )

#NOTE THAT WE DROP ALL SCORES, WHICH ARE INCORRECTLY CALCULATED IN THE stimulus engine. We do not drop the raw responses (answers)
df_items <- df_items %>%
  mutate(
    # subject=factor(subject),
    # condition=factor(condition),
    # Same condition -> label mapping used for the participants file.
    pretty_condition = recode_factor(condition,
                                     "113112" = "TRI-rotate-45", "111112" = "ORTH-rotate-45",
                                     "11311221" = "TRI-rotate-45", "11111221" = "ORTH-rotate-45",
                                     "11311331" = "TRI-rotate-90", "11111331" = "ORTH-rotate-90"),
    pool = factor(pool),
    mode = factor(mode),
    # explicit=factor(explicit),
    # impasse = factor(impasse),
    # grid = factor(grid),
    # mark = factor(mark),
    # ixn = factor(ixn),
    term = factor(term),
    relation = factor(relation),
    block = factor(block),
    correct = factor(correct),
    q = factor(q),
    rt_s = rt / 1000, #ms -> seconds
    time_elapsed_m = time_elapsed / 1000 / 60 #ms -> minutes
  ) %>% dplyr::select(
    subject,
    study,
    term,
    pool,
    mode,
    condition,
    pretty_condition,
    block,
    explicit,
    impasse,
    grid,
    mark,
    ixn,
    gwidth,
    gheight,
    graph,
    time_elapsed_m,
    question,
    relation,
    q,
    correct,
    # discriminant,
    # tri_score,
    # orth_score,
    # other_score,
    # blank_score,
    answer,
    rt_s
  ) #WE DROP ALL SCORES BC THEY ARE RESCORED IN ANALYSIS FILE

1 Data Validation

1.1 Exclusions

1.1.1 Completion Status

Starting with Winter 2022, data are saved to the database even if the subject’s browser did not meet minimum specifications (at which point they are prompted to change browsers, or end the study). This allows us to learn about the browsers, screen sizes and OS that (potential) subjects are using. However, these data are not exported from the database for analysis (see flatten.js and status.js scripts). Thus, only subjects who successfully completed the entire study are included in this file.

#MANUALLY INSPECT status
# One row per completion status with its participant count.
df_participants %>% dplyr::count(status)
## # A tibble: 1 × 2
##   status      n
##   <fct>   <int>
## 1 success   279

279 successfully completed the study.

#DISCARD participants whose completion status is not "success"
ex_participants <- df_participants %>%
  filter(status != "success") %>%
  mutate(reason = "invalid-status") %>%
  rbind(ex_participants, .)

# Retain only participants not on the running exclusion list.
df_participants <- df_participants %>%
  filter(!(subject %in% ex_participants$subject))

No data need to be excluded on account of completion status.

1.1.2 Conditions

Participants are randomly assigned to an experimental condition when starting the study. Here we validate that only conditions for the current study are included in this dataset.

#MANUALLY INSPECT conditions
# Participant counts per raw condition code.
df_participants %>% dplyr::count(condition)
## # A tibble: 6 × 2
##   condition     n
##   <chr>     <int>
## 1 111112       17
## 2 11111221     61
## 3 11111331     66
## 4 113112       21
## 5 11311221     45
## 6 11311331     69
# Participant counts per derived condition label.
df_participants %>% dplyr::count(pretty_condition)
## # A tibble: 4 × 2
##   pretty_condition     n
##   <fct>            <int>
## 1 TRI-rotate-45       66
## 2 ORTH-rotate-45      78
## 3 TRI-rotate-90       69
## 4 ORTH-rotate-90      66

Data from conditions not corresponding to valid conditions should be discarded.

#DISCARD participants assigned to conditions invalid for this study
ex_participants <- df_participants %>%
  filter(!(condition %in% conditions)) %>%
  mutate(reason = "invalid-condition") %>%
  rbind(ex_participants, .)

# Retain only participants not on the running exclusion list.
df_participants <- df_participants %>%
  filter(!(subject %in% ex_participants$subject))

No data need to be excluded on account of condition.

1.1.3 Sessions

The (string) session code is embedded in the URL querystring by the experimenter to differentiate testing sessions in SONA from demo and other environment setup tasks.

#MANUALLY INSPECT sessions
# Session codes present in the data, with participant counts.
df_participants %>% dplyr::count(session)
## # A tibble: 2 × 2
##   session        n
##   <fct>      <int>
## 1 su22sona       1
## 2 suPROLIFIC   278

Data from sessions not corresponding to valid sessions should be discarded.

#DISCARD participants from invalid sessions (app testing / pilot codes)
ex_participants <- df_participants %>%
  filter(!(session %in% sessions)) %>%
  mutate(reason = "invalid-session") %>%
  rbind(ex_participants, .)

# Retain only participants not on the running exclusion list.
df_participants <- df_participants %>%
  filter(!(subject %in% ex_participants$subject))

One participant record was excluded on account of session (ie. app testing or pilot session).

1.1.4 Browser Interaction Violations

Browser interaction data is recorded by jspsych allowing us to determine if subjects violate our instructions not to leave the browser tab (or exit fullscreen mode) during test. These incidents are recorded in jspsych interaction data object, and the number of violations is counted and added to the participant data file.

Due to eccentricities in how browser events are captured, 1-2 browser violations can be recorded even if the subject did not leave the browser window (e.g. when resizing the window to meet minimum requirements).

#MANUALLY INSPECT violations
# Distribution of browser interaction violation counts per participant.
df_participants %>% dplyr::count(violations)
## # A tibble: 19 × 2
##    violations     n
##         <dbl> <int>
##  1        1     163
##  2        1.5     3
##  3        2      48
##  4        2.5     5
##  5        3      27
##  6        3.5     6
##  7        4       7
##  8        4.5     1
##  9        5       3
## 10        5.5     3
## 11        6       4
## 12        6.5     1
## 13       10.5     1
## 14       11.5     1
## 15       13       1
## 16       13.5     1
## 17       16       1
## 18       17.5     1
## 19       20       1
#DISCARD participants exceeding the threshold of browser interaction violations
# Strictly greater-than: a participant with exactly violation_threshold is retained.
ex_participants <- df_participants %>%
  filter(violations > violation_threshold) %>%
  mutate(reason = "exceeded-violations") %>%
  rbind(ex_participants, .)

df_participants <- df_participants %>%
  filter(!(subject %in% ex_participants$subject))

12 participants were excluded for exceeding the maximum allowed number of browser interaction violations.

1.1.5 Effort

To assist in mitigating increased noise in data collected asynchronously from the UCSD student subject pool, we added explicit ratings of how much effort the participant expended on the task. This question was implemented as a multiple-choice drop-down on an ‘Effort’ page prior to the ‘Demographics’ survey at the end of the study. Subjects were given four options : (1) I tried my best on each question, (2) I tried my best on most questions, (3) I started out trying hard, but gave up at some point, (4) I didn’t try very hard, or rushed through the questions.

#MANUALLY INSPECT effort
# Self-rated effort responses with participant counts.
df_participants %>% dplyr::count(effort)
## # A tibble: 3 × 2
##   effort                                                   n
##   <chr>                                                <int>
## 1 I started out trying hard, but gave up at some point     4
## 2 I tried my best on each question                       229
## 3 I tried my best on most questions                       33

Participants answering with options I didn’t try very hard, or rushed through the questions or I started out trying hard, but gave up at some point are excluded from analysis.

#DISCARD participants who indicated they did not expend adequate effort on the study
ex_participants <- df_participants %>%
  filter(effort %in% effort_exclusion) %>%
  mutate(reason = "selfrated-effort") %>%
  rbind(ex_participants, .)

df_participants <- df_participants %>%
  filter(!(subject %in% ex_participants$subject))

Four participants are excluded for low (self-rated) effort.

1.1.6 Attention Check

The 6th question in the study is non-discriminatory (can easily get correct answer regardless of strategy) and serves as an attention check question.

#MANUALLY INSPECT attention
# Attention-check pass/fail counts.
df_participants %>% dplyr::count(attn_check)
## # A tibble: 2 × 2
##   attn_check     n
##   <fct>      <int>
## 1 FALSE         32
## 2 TRUE         230

Participants who answered the attention check question incorrectly should be excluded. (Note that the exclusion implemented below is applied only within the ORTH-rotate-45 condition.)

#DISCARD participants who failed the attention check
# NOTE(review): the prose above says all attention-check failures should be excluded,
# but this filter excludes failures only in the ORTH-rotate-45 condition; the final
# codebook still shows 28 attn_check == FALSE records. Confirm this restriction is
# intentional (e.g. a preregistered cell-balancing decision) and not a leftover hack.
exclude_attn <- df_participants %>%
          filter(attn_check == FALSE & pretty_condition =="ORTH-rotate-45") %>%
          mutate(reason="failed-attnchk")

ex_participants <- rbind(ex_participants, exclude_attn)
rm(exclude_attn)

df_participants <- df_participants %>%
  filter( ! subject %in% ex_participants$subject)

Four participants are excluded for failing the attention check question.

1.1.7 Items

Next, we need to discard item_level data for excluded participants.

# Preserve the item-level rows of excluded participants (for export), then drop them.
ex_items <- df_items %>%
  filter(subject %in% ex_participants$subject)

df_items <- df_items %>%
  filter(!(subject %in% ex_participants$subject))

1.2 Validation

After all exclusions, we are left with the following number of participants per condition:

#MANUALLY INSPECT conditions
# Per-condition participant counts after all exclusions.
df_participants %>% dplyr::count(pretty_condition)
## # A tibble: 4 × 2
##   pretty_condition     n
##   <fct>            <int>
## 1 TRI-rotate-45       61
## 2 ORTH-rotate-45      69
## 3 TRI-rotate-90       67
## 4 ORTH-rotate-90      61

Finally, we need to validate we have a complete set of items for all valid participants.

# Completeness check: each retained participant must contribute exactly n_items item rows.
nrow(df_items) == nrow(df_participants) * n_items
## [1] TRUE

2 Participants Codebook

#see https://cran.r-project.org/web/packages/codebook/vignettes/codebook_tutorial.html

#ADD VARIABLE METADATA
dict <- rio::import("input/dictionary_sgc4c_participants.csv", "csv") #import data dictionary
var_label(df_participants) <- dict %>% select(VARIABLE, DESCRIPTION) %>% dict_to_list() #add variable labels

#ADD DATASET METADATA
metadata(df_participants)$name <- "Experimental PARTICIPANTS for study SGC4C"
metadata(df_participants)$description <- "Data for study SGC4C summarized at PARTICIPANT  level"
metadata(df_participants)$creator <- "Amy Rae Fox"
metadata(df_participants)$contact <- "amyraefox@gmail.com"
#{r, eval = checkMode() == "pdf"} #ONLY FOR PDF KNIT
# Compact skimr-style codebook summary (used in the PDF knit path).
codebook::skim_codebook(df_participants) 
## Warning in sorted_count(x): Variable contains value(s) of "" that have been
## converted to "empty".
Data summary
Name data
Number of rows 258
Number of columns 32
_______________________
Column type frequency:
character 10
factor 13
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
subject 0 1 5 5 0 258 0
condition 0 1 6 8 0 6 0
sona_id 0 1 0 24 8 250 0
effort 0 1 32 33 0 2 0
other 0 1 0 535 120 133 0
country 0 1 2 28 0 17 0
language 0 1 7 9 0 4 0
schoolyear 0 1 7 27 0 7 0
disability 0 1 0 173 82 44 0
starttime 0 1 24 24 0 258 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
study 0 1 FALSE 1 SGC: 258
pretty_condition 0 1 FALSE 4 ORT: 69, TRI: 67, TRI: 61, ORT: 61
session 0 1 FALSE 1 suP: 258, su2: 0
exp_id 0 1 FALSE 7 630: 67, 630: 61, 630: 44, 630: 42
pool 0 1 FALSE 1 pro: 258, son: 0
mode 0 1 FALSE 1 asy: 258
attn_check 0 1 FALSE 2 TRU: 230, FAL: 28
major 0 1 FALSE 7 Mat: 74, Hum: 46, Soc: 46, Fin: 37
gender 0 1 FALSE 3 Mal: 125, Fem: 124, Oth: 9
browser 0 1 FALSE 1 chr: 258
os 0 1 FALSE 6 Win: 169, Mac: 64, Chr: 10, Lin: 6
status 0 1 FALSE 1 suc: 258
term 0 1 FALSE 1 sum: 258

Variable type: numeric

skim_variable n_missing complete_rate mean sd min median max hist
difficulty 0 1 3.79 0.94 1.00 4.00 5.00 ▁▂▆▇▅
confidence 0 1 3.18 1.19 1.00 3.00 5.00 ▂▆▇▇▃
enjoyment 0 1 3.22 1.28 1.00 3.00 5.00 ▃▅▇▆▆
age 0 1 36.04 14.71 18.00 31.50 79.00 ▇▅▂▂▁
width 0 1 1639.75 363.64 1127.00 1536.00 3440.00 ▇▅▁▁▁
height 0 1 852.33 170.25 681.00 789.00 1586.00 ▇▃▁▁▁
violations 0 1 1.67 1.02 1.00 1.00 5.50 ▇▂▂▁▁
absolute_score 0 1 4.56 4.26 0.00 4.00 12.00 ▇▂▃▂▃
totaltime_m 0 1 13.84 7.61 3.22 12.39 73.05 ▇▂▁▁▁
# Full codebook (HTML knit); verbose sections disabled to keep the report compact.
codebook(df_participants, #ONLY FOR HTML KNIT
         metadata_table = TRUE,
         detailed_variables = FALSE,
         detailed_scales = FALSE,
         metadata_json = FALSE,
         survey_overview = FALSE,
         missingness_report = FALSE)
## Warning in sorted_count(x): Variable contains value(s) of "" that have been
## converted to "empty".

2.0.1 Metadata

2.0.1.1 Description

Dataset name: Experimental PARTICIPANTS for study SGC4C

Data for study SGC4C summarized at PARTICIPANT level

Metadata for search engines
  • Date published: 2022-09-05

  • Creator:

name value
1 Amy Rae Fox
x
x
subject
study
condition
pretty_condition
session
exp_id
sona_id
pool
mode
attn_check
effort
difficulty
confidence
enjoyment
other
age
country
language
schoolyear
major
gender
disability
browser
width
height
os
starttime
status
term
violations
absolute_score
totaltime_m

2.1 Codebook table

3 Items Codebook

#see https://cran.r-project.org/web/packages/codebook/vignettes/codebook_tutorial.html

#ADD VARIABLE METADATA
dict <- rio::import("input/dictionary_sgc4c_items.csv", "csv") #import data dictionary

var_label(df_items) <- dict %>% select(VARIABLE, DESCRIPTION) %>% dict_to_list() #add variable labels

#ADD DATASET METADATA
metadata(df_items)$name <- "Experimental ITEMS for study SGC4C"
metadata(df_items)$description <- "Data for study SGC4C summarized at participant-item level"
metadata(df_items)$creator <- "Amy Rae Fox"
metadata(df_items)$contact <- "amyraefox@gmail.com"
#{r, eval = checkMode() == "pdf"} #ONLY FOR PDF EXPORT
# Compact skimr-style codebook summary (used in the PDF knit path).
skim_codebook(df_items) 
Data summary
Name data
Number of rows 3870
Number of columns 23
_______________________
Column type frequency:
character 11
factor 8
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
subject 0 1 5 5 0 258 0
study 0 1 5 5 0 1 0
condition 0 1 6 8 0 6 0
explicit 0 1 1 1 0 1 0
impasse 0 1 1 1 0 1 0
grid 0 1 1 1 0 2 0
mark 0 1 1 1 0 1 0
ixn 0 1 1 1 0 1 0
graph 0 1 10 10 0 1 0
question 0 1 26 87 0 15 0
answer 0 1 0 25 129 170 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
term 0 1 FALSE 1 sum: 3870
pool 0 1 FALSE 1 pro: 3870, son: 0
mode 0 1 FALSE 1 asy: 3870
pretty_condition 0 1 FALSE 4 ORT: 1035, TRI: 1005, TRI: 915, ORT: 915
block 0 1 FALSE 3 ite: 1806, ite: 1290, ite: 774
relation 0 1 FALSE 10 end: 516, mee: 516, mid: 516, sta: 516
q 0 1 FALSE 15 1: 258, 2: 258, 3: 258, 4: 258
correct 0 1 FALSE 2 FAL: 2192, TRU: 1678

Variable type: numeric

skim_variable n_missing complete_rate mean sd min median max hist
gwidth 0 1 600.00 0.00 600.00 600.00 600.00 ▁▁▇▁▁
gheight 0 1 600.00 0.00 600.00 600.00 600.00 ▁▁▇▁▁
time_elapsed_m 0 1 7.50 5.67 0.49 6.33 70.13 ▇▁▁▁▁
rt_s 0 1 40.32 49.71 0.12 25.29 1041.81 ▇▁▁▁▁
# Full codebook (HTML knit); verbose sections disabled to keep the report compact.
codebook(df_items,#ONLY FOR HTML EXPORT
         metadata_table = TRUE,
         detailed_variables = FALSE,
         detailed_scales = FALSE,
         metadata_json = FALSE,
         survey_overview = FALSE,
         missingness_report = FALSE)

3.0.1 Metadata

3.0.1.1 Description

Dataset name: Experimental ITEMS for study SGC4C

Data for study SGC4C summarized at participant-item level

Metadata for search engines
  • Date published: 2022-09-05

  • Creator:

name value
1 Amy Rae Fox
x
x
subject
study
term
pool
mode
condition
pretty_condition
block
explicit
impasse
grid
mark
ixn
gwidth
gheight
graph
time_elapsed_m
question
relation
q
correct
answer
rt_s

3.1 Codebook table

4 Explore

Exploration of the distribution of key response variables for validation purposes:

# Participant-level distribution of absolute score (count strictly correct).
gf_histogram( ~absolute_score ,data = df_participants) + 
  labs(title = "SGC4C Distribution of Absolute Score")

# Density histogram of absolute score, faceted by condition.
gf_dhistogram( ~absolute_score ,data = df_participants) %>% 
  gf_facet_wrap(~pretty_condition) +
  labs(title = "SGC4C Distribution of Absolute Score (by Condition)")

# Overall proportion of correct vs incorrect item responses.
gf_props(~correct, data = df_items) + 
  labs(title = "SGC4C Distribution of Item Absolute Score")

gf_props(~correct, data = df_items) %>% 
  gf_facet_wrap(~pretty_condition) + 
  labs(title = "SGC4C Distribution of Item Absolute Score (by Condition)")

gf_histogram( ~totaltime_m ,data = df_participants) + 
  labs(title = "SGC4C Distribution of Total Study Time")

# NOTE(review): duplicates the absolute-score histogram above but faceted by condition;
# the title omits "(by Condition)" — possibly a copy-paste oversight.
gf_histogram( ~absolute_score ,data = df_participants) %>% 
  gf_facet_wrap(~pretty_condition) +
  labs(title = "SGC4C Distribution of Absolute Score")

gf_histogram(~rt_s, data = df_items) + 
  labs(title = "SGC4C Distribution of Item Response Time")

# NOTE(review): plots participant-level total study time against absolute score, but the
# title says "Item Response Time" — confirm the intended variable and/or title.
gf_jitter(totaltime_m ~ absolute_score , data = df_participants) + 
  labs(title = "SGC4C Item Response Time vs Accuracy")

4.1 PEEKING

library(ggstatsplot)
## You can cite this package as:
##      Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
##      Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
# Exploratory only: nonparametric comparison of absolute score across conditions.
ggbetweenstats( data = df_participants, x = pretty_condition, y = absolute_score, 
                type = "nonparametric")

# Exploratory only: item correctness proportions by condition.
ggbarstats( data = df_items, x = correct, y = pretty_condition)

5 Data Export

5.1 Save Exclusions

For transparency, we save and identify the excluded data.

# Persist the excluded records (with their `reason` codes) for audit/transparency.
write.csv(ex_participants,"output/excluded_participants_summer22_sgc4c.csv", row.names = FALSE)
write.csv(ex_items,"output/excluded_items_summer22_sgc4c.csv", row.names = FALSE)

5.2 Analysis-Ready Files

#CSV files
# Note: CSV export loses factor levels and codebook metadata; the .rds files below keep them.
write.csv(df_participants,"output/sgc4c_participants.csv", row.names = FALSE)
write.csv(df_items,"output/sgc4c_items.csv", row.names = FALSE)

#export R DATA STRUCTURES (include codebook metadata)
rio::export(df_participants, "output/sgc4c_participants.rds") # to R data structure file
rio::export(df_items, "output/sgc4c_items.rds") # to R data structure file